# General packages for stuff
library(tidyverse)
library(here)
library(janitor)
library(plotly)
# Packages for spatial stuff & point pattern analysis
library(tmap)
library(sf)
library(spatstat)
library(maptools)
library(sp)
library(raster)
# Packages for cluster analysis:
library(NbClust)
library(cluster)
library(factoextra)
library(dendextend)
library(ggdendro)
voles <- read_sf(dsn = here("data","redtreevoledata"),
layer = "ds033") %>%
dplyr::select(COUNTY) %>%
filter(COUNTY == "HUM") %>%
st_transform(crs = 4326)
# plot(voles)
# Get Humboldt County outline
humboldt <- read_sf(dsn = here("data","redtreevoledata"),
layer = "california_county_shape_file") %>%
filter(NAME == "Humboldt") %>%
dplyr::select(NAME)
st_crs(humboldt) <- 4326
# plot(humboldt)
# Plot them together:
tm_shape(humboldt) +
tm_fill() +
tm_shape(voles) +
tm_dots(size = 0.2)
# Or with ggplot2:
ggplot() +
geom_sf(data = humboldt) +
geom_sf(data = voles)
# Or save it:
ggsave(here("figures","humvoles.png"),
units = "in",
width = 4,
height = 6,
dpi = 300)
# Another example (with tiff...there's also jpeg, png, etc.)
# tiff("humvoles2.tiff", units = "in", width = 5, height = 5, res = 300)
# Customize it however you want:
ggplot() +
geom_sf(data = humboldt, fill = "black") +
geom_sf(data = voles, color = "red", alpha = 0.5)
# dev.off()
We want to explore point patterns in a few different ways. Quadrat analysis, nearest neighbor analysis, etc. to compare with.
First we need to convert to ‘ppp’ and ‘owin’ - the points and windows, as used by maptools and spatstat (because sf is still catching up for raster and point pattern analysis stuff)…
## UPDATE!
voles_sp <- as(voles,"Spatial")
voles_ppp <- as(voles_sp, "ppp")
# Breaks in 244 Winter 2020
# Projection issue - switch to 6345 above and this works KIND OF -- but seems problematic. Need to fix for 244 Winter 2021!!!
humboldt_sp <- as(humboldt, "Spatial")
humboldt_win <- as(humboldt_sp, "owin")
voles_pb <- ppp(voles_ppp$x, voles_ppp$y, window = humboldt_win)
plot(voles_pb)
vole_qt <- quadrat.test(voles_pb, nx = 5, ny = 10) # nx and ny are number of columns/rows for the rectangles created
# Returns: VoleQT
# Chi-squared test of CSR using quadrat counts
# data: VolePPP
# X-squared = 425.94, df = 45, p-value < 2.2e-16
# alternative hypothesis: two.sided
# Reject the null hypothesis of spatial evenness! But we still don't know if more clustered or more uniform...
plot(voles_pb)
plot(vole_qt, add = TRUE, cex = 0.4)
Plot densities:
point_density <- density(voles_pb, sigma = 0.02)
plot(point_density)
# Can you start viewing this in tmap? Yes, rasterize it:
wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
vole_raster <- raster(point_density, crs = wgs84)
# Then plot:
tm_shape(vole_raster) +
tm_raster(midpoint = NA,
palette = "Blues",
legend.show = FALSE)
Nearest neighbor (G-function)
r <- seq(0,0.15, by = 0.005)
gfunction <- envelope(voles_pb, fun = Gest, r = r, nsim = 100, nrank = 2) # Sig level of Monte Carlo = 0.04
## Generating 100 simulations of CSR ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
## 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
## 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100.
##
## Done.
plot(gfunction$obs ~ gfunction$r, type = "l", col = "black", lty = 11)
lines(gfunction$hi ~ gfunction$r, type = "l", col = "blue", lty = 8)
lines(gfunction$theo ~ gfunction$r, type = "l", col = "red", lty = 6)
lines(gfunction$lo ~ gfunction$r, type = "l", col = "green", lty = 4)
# Confirms, in combination with quadrat.test, clustered data!
Nearest Neighbor by Ripley’s K (using L standardization)
r2 <- seq(0,0.5, by = 0.05)
lfunction <- envelope(voles_pb, fun = Lest, r = r2, nsim = 20, rank = 2, global = TRUE)
## Generating 20 simulations of CSR ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20.
##
## Done.
plot(lfunction$obs ~ lfunction$r, type = "l", col = "black", lty = 11)
lines(lfunction$hi ~ lfunction$r, type = "l", col = "blue", lty = 8)
lines(lfunction$theo ~ lfunction$r, type = "l", col = "red", lty = 6)
lines(lfunction$lo ~ lfunction$r, type = "l", col = "green", lty = 4)
Diggle-Cressie-Loosmore-Ford test of CSR
DCLFTest <- dclf.test(voles_pb, nsim = 100, rank = 2)
## Generating 100 simulations of CSR ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
## 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
## 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100.
##
## Done.
DCLFTest
##
## Diggle-Cressie-Loosmore-Ford test of CSR
## Monte Carlo test based on 100 simulations
## Summary function: K(r)
## Reference function: theoretical
## Alternative: two.sided
## Interval of distance values: [0, 0.250888]
## Test statistic: Integral of squared absolute deviation
## Deviation = observed minus theoretical
##
## data: voles_pb
## u = 0.001952, rank = 1, p-value = 0.009901
Nevermind…back to irises dataset?
iris_nice <- iris %>%
clean_names()
ggplot(iris_nice) +
geom_point(aes(x = petal_length, y = petal_width, color = species))
# How many clusters do you THINK there should be?
number_est <- NbClust(iris_nice[1:4], min.nc = 2, max.nc = 10, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 11 proposed 3 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# By these estimators, 2 is the best number of clusters...but should that change our mind? Maybe...
# What if we consider similarities across all four variables?
iris_km <- kmeans(iris_nice[1:4], 3) # kmeans specifying 3 groups!
iris_km$size
## [1] 33 21 96
iris_km$centers
## sepal_length sepal_width petal_length petal_width
## 1 5.175758 3.624242 1.472727 0.2727273
## 2 4.738095 2.904762 1.790476 0.3523810
## 3 6.314583 2.895833 4.973958 1.7031250
iris_km$cluster
## [1] 1 2 2 2 1 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 2 1 1
## [38] 1 2 1 1 2 2 1 1 2 1 2 1 1 3 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3
## [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3
# Bind the cluster number to the original data
iris_cl <- data.frame(iris_nice, cluster_no = factor(iris_km$cluster))
ggplot(iris_cl) +
geom_point(aes(x = sepal_length, y = sepal_width, color = cluster_no))
A little better…
ggplot(iris_cl) +
geom_point(aes(x = petal_length,
y = petal_width,
color = cluster_no,
pch = species)) +
scale_color_brewer(palette = "Set2")
Make it 3D with plot_ly()…
# Or, a 3D plot with plotly
plot_ly(x = iris_cl$petal_length,
y = iris_cl$petal_width,
z = iris_cl$sepal_width,
type = "scatter3d",
color = iris_cl$cluster_no,
symbol = ~iris_cl$species,
marker = list(size = 3),
colors = "Set1")
####Part 2. Cluster analysis: hierarchical
Hierarchical cluster analysis (dendrograms) in R
Relevant functions:
stats::hclust() - agglomerative hierarchical clustering cluster::diana() - divisive hierarchical clustering
We’ll be using WorldBank environmental data (simplified), wb_env.csv
# Get the data
wb_env <- read_csv(here("data", "wb_env.csv"))
# Only keep top 20 greenhouse gas emitters (for simplifying visualization here...)
wb_ghg_20 <- wb_env %>%
arrange(-ghg) %>%
head(20)
# Scale it (can consider this for k-means clustering, too...)
wb_scaled <- as.data.frame(scale(wb_ghg_20[3:7]))
# Update to add rownames (country name)
rownames(wb_scaled) <- wb_ghg_20$name
# Compute dissimilarity values (Euclidean distances):
diss <- dist(wb_scaled, method = "euclidean")
# Hierarchical clustering (complete linkage)
hc_complete <- hclust(diss, method = "complete" )
# Plot it (base plot):
plot(hc_complete, cex = 0.6, hang = -1)
Divisive clustering:
hc_div <- diana(diss)
plot(hc_div, hang = -1)
rect.hclust(hc_div, k = 4, border = 2:5)
We might want to compare those…because they differ slightly.
# Convert to class dendrogram
dend1 <- as.dendrogram(hc_complete)
dend2 <- as.dendrogram(hc_div)
# Combine into list
dend_list <- dendlist(dend1,dend2)
# Make a tanglegram
tanglegram(dend1, dend2)
# Convert to class 'dendro' for ggplotting
data1 <- dendro_data(hc_complete)
# Simple plot with ggdendrogram
ggdendrogram(hc_complete,
rotate = TRUE) +
theme_minimal() +
labs(x = "Country")
# Want to do it actually in ggplot? Here:
label_data <- bind_cols(filter(segment(data1), x == xend & x%%1 == 0), label(data1))
ggplot() +
geom_segment(data=segment(data1), aes(x=x, y=y, xend=xend, yend=yend)) +
geom_text(data=label_data, aes(x=xend, y=yend, label=label, hjust=0), size=2) +
coord_flip() +
scale_y_reverse(expand=c(0.2, 0)) +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "None")